{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced Read Operations\n",
    "\n",
    "\n",
    "ParquetDB offers a `read` method that enables a range of advanced data retrieval options. These options include:\n",
    "- Filtering data by column values (predicate pushdown),\n",
    "- Selecting or excluding specific columns,\n",
    "- Batch processing for optimized memory usage,\n",
    "- Rebuilding nested structures for hierarchical data, and more.\n",
    "\n",
    "In this notebook, we will explore how to leverage **advanced read** capabilities to efficiently retrieve data that meets your exact requirements.\n",
    "\n",
    "\n",
    "\n",
    "The `read` method has the following signature:\n",
    "\n",
    "```python\n",
    "def read(\n",
    "    self,\n",
    "    ids: List[int] = None,\n",
    "    columns: List[str] = None,\n",
    "    filters: List[pc.Expression] = None,\n",
    "    load_format: str = \"table\",\n",
    "    batch_size: int = None,\n",
    "    include_cols: bool = True,\n",
    "    rebuild_nested_struct: bool = False,\n",
    "    rebuild_nested_from_scratch: bool = False,\n",
    "    load_config: LoadConfig = LoadConfig(),\n",
    "    normalize_config: NormalizeConfig = NormalizeConfig(),\n",
    ") -> Union[pa.Table, Generator, Any]:\n",
    "    ...\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generated Data:\n",
      "{'column_0': 'test_64',\n",
      " 'column_1': 'test_83',\n",
      " 'column_10': 'test_71',\n",
      " 'column_100': 'test_68',\n",
      " 'column_101': 'test_65',\n",
      " 'column_102': 'test_67',\n",
      " 'column_103': 'test_60',\n",
      " 'column_104': 'test_73',\n",
      " 'column_105': 'test_10',\n",
      " 'column_106': 'test_74',\n",
      " 'column_107': 'test_73',\n",
      " 'column_108': 'test_60',\n",
      " 'column_109': 'test_58',\n",
      " 'column_11': 'test_39',\n",
      " 'column_110': 'test_100',\n",
      " 'column_111': 'test_18',\n",
      " 'column_112': 'test_42',\n",
      " 'column_113': 'test_62',\n",
      " 'column_114': 'test_58',\n",
      " 'column_115': 'test_16',\n",
      " 'column_116': 'test_84',\n",
      " 'column_117': 'test_32',\n",
      " 'column_118': 'test_11',\n",
      " 'column_119': 'test_28',\n",
      " 'column_12': 'test_73',\n",
      " 'column_120': 'test_49',\n",
      " 'column_121': 'test_12',\n",
      " 'column_122': 'test_22',\n",
      " 'column_123': 'test_52',\n",
      " 'column_124': 'test_37',\n",
      " 'column_125': 'test_1',\n",
      " 'column_126': 'test_47',\n",
      " 'column_127': 'test_37',\n",
      " 'column_128': 'test_12',\n",
      " 'column_129': 'test_15',\n",
      " 'column_13': 'test_36',\n",
      " 'column_130': 'test_35',\n",
      " 'column_131': 'test_54',\n",
      " 'column_132': 'test_34',\n",
      " 'column_133': 'test_94',\n",
      " 'column_134': 'test_65',\n",
      " 'column_135': 'test_42',\n",
      " 'column_136': 'test_10',\n",
      " 'column_137': 'test_57',\n",
      " 'column_138': 'test_68',\n",
      " 'column_139': 'test_31',\n",
      " 'column_14': 'test_24',\n",
      " 'column_140': 'test_4',\n",
      " 'column_141': 'test_34',\n",
      " 'column_142': 'test_36',\n",
      " 'column_143': 'test_73',\n",
      " 'column_144': 'test_2',\n",
      " 'column_145': 'test_12',\n",
      " 'column_146': 'test_22',\n",
      " 'column_147': 'test_5',\n",
      " 'column_148': 'test_74',\n",
      " 'column_149': 'test_78',\n",
      " 'column_15': 'test_100',\n",
      " 'column_150': 'test_60',\n",
      " 'column_151': 'test_1',\n",
      " 'column_152': 'test_75',\n",
      " 'column_153': 'test_48',\n",
      " 'column_154': 'test_52',\n",
      " 'column_155': 'test_40',\n",
      " 'column_156': 'test_14',\n",
      " 'column_157': 'test_66',\n",
      " 'column_158': 'test_65',\n",
      " 'column_159': 'test_76',\n",
      " 'column_16': 'test_37',\n",
      " 'column_160': 'test_52',\n",
      " 'column_161': 'test_8',\n",
      " 'column_162': 'test_41',\n",
      " 'column_163': 'test_76',\n",
      " 'column_164': 'test_22',\n",
      " 'column_165': 'test_92',\n",
      " 'column_166': 'test_70',\n",
      " 'column_167': 'test_67',\n",
      " 'column_168': 'test_67',\n",
      " 'column_169': 'test_74',\n",
      " 'column_17': 'test_30',\n",
      " 'column_170': 'test_86',\n",
      " 'column_171': 'test_5',\n",
      " 'column_172': 'test_36',\n",
      " 'column_173': 'test_56',\n",
      " 'column_174': 'test_32',\n",
      " 'column_175': 'test_25',\n",
      " 'column_176': 'test_29',\n",
      " 'column_177': 'test_78',\n",
      " 'column_178': 'test_91',\n",
      " 'column_179': 'test_75',\n",
      " 'column_18': 'test_29',\n",
      " 'column_180': 'test_30',\n",
      " 'column_181': 'test_15',\n",
      " 'column_182': 'test_21',\n",
      " 'column_183': 'test_37',\n",
      " 'column_184': 'test_57',\n",
      " 'column_185': 'test_63',\n",
      " 'column_186': 'test_82',\n",
      " 'column_187': 'test_49',\n",
      " 'column_188': 'test_60',\n",
      " 'column_189': 'test_32',\n",
      " 'column_19': 'test_66',\n",
      " 'column_190': 'test_69',\n",
      " 'column_191': 'test_56',\n",
      " 'column_192': 'test_77',\n",
      " 'column_193': 'test_25',\n",
      " 'column_194': 'test_4',\n",
      " 'column_195': 'test_24',\n",
      " 'column_196': 'test_89',\n",
      " 'column_197': 'test_29',\n",
      " 'column_198': 'test_10',\n",
      " 'column_199': 'test_75',\n",
      " 'column_2': 'test_51',\n",
      " 'column_20': 'test_44',\n",
      " 'column_200': 'test_76',\n",
      " 'column_201': 'test_1',\n",
      " 'column_202': 'test_7',\n",
      " 'column_203': 'test_35',\n",
      " 'column_204': 'test_33',\n",
      " 'column_205': 'test_91',\n",
      " 'column_206': 'test_83',\n",
      " 'column_207': 'test_3',\n",
      " 'column_208': 'test_91',\n",
      " 'column_209': 'test_63',\n",
      " 'column_21': 'test_9',\n",
      " 'column_210': 'test_3',\n",
      " 'column_211': 'test_45',\n",
      " 'column_212': 'test_88',\n",
      " 'column_213': 'test_4',\n",
      " 'column_214': 'test_90',\n",
      " 'column_215': 'test_66',\n",
      " 'column_216': 'test_16',\n",
      " 'column_217': 'test_37',\n",
      " 'column_218': 'test_84',\n",
      " 'column_219': 'test_82',\n",
      " 'column_22': 'test_2',\n",
      " 'column_220': 'test_96',\n",
      " 'column_221': 'test_3',\n",
      " 'column_222': 'test_4',\n",
      " 'column_223': 'test_28',\n",
      " 'column_224': 'test_79',\n",
      " 'column_225': 'test_50',\n",
      " 'column_226': 'test_67',\n",
      " 'column_227': 'test_23',\n",
      " 'column_228': 'test_96',\n",
      " 'column_229': 'test_79',\n",
      " 'column_23': 'test_24',\n",
      " 'column_230': 'test_26',\n",
      " 'column_231': 'test_61',\n",
      " 'column_232': 'test_82',\n",
      " 'column_233': 'test_38',\n",
      " 'column_234': 'test_6',\n",
      " 'column_235': 'test_33',\n",
      " 'column_236': 'test_16',\n",
      " 'column_237': 'test_92',\n",
      " 'column_238': 'test_1',\n",
      " 'column_239': 'test_6',\n",
      " 'column_24': 'test_19',\n",
      " 'column_240': 'test_86',\n",
      " 'column_241': 'test_28',\n",
      " 'column_242': 'test_38',\n",
      " 'column_243': 'test_82',\n",
      " 'column_244': 'test_8',\n",
      " 'column_245': 'test_41',\n",
      " 'column_246': 'test_34',\n",
      " 'column_247': 'test_11',\n",
      " 'column_248': 'test_13',\n",
      " 'column_249': 'test_36',\n",
      " 'column_25': 'test_36',\n",
      " 'column_250': 'test_14',\n",
      " 'column_251': 'test_28',\n",
      " 'column_252': 'test_87',\n",
      " 'column_253': 'test_70',\n",
      " 'column_254': 'test_28',\n",
      " 'column_255': 'test_4',\n",
      " 'column_256': 'test_68',\n",
      " 'column_257': 'test_38',\n",
      " 'column_258': 'test_60',\n",
      " 'column_259': 'test_95',\n",
      " 'column_26': 'test_92',\n",
      " 'column_260': 'test_83',\n",
      " 'column_261': 'test_22',\n",
      " 'column_262': 'test_22',\n",
      " 'column_263': 'test_26',\n",
      " 'column_264': 'test_87',\n",
      " 'column_265': 'test_88',\n",
      " 'column_266': 'test_60',\n",
      " 'column_267': 'test_96',\n",
      " 'column_268': 'test_73',\n",
      " 'column_269': 'test_79',\n",
      " 'column_27': 'test_6',\n",
      " 'column_270': 'test_13',\n",
      " 'column_271': 'test_63',\n",
      " 'column_272': 'test_14',\n",
      " 'column_273': 'test_44',\n",
      " 'column_274': 'test_5',\n",
      " 'column_275': 'test_80',\n",
      " 'column_276': 'test_89',\n",
      " 'column_277': 'test_52',\n",
      " 'column_278': 'test_8',\n",
      " 'column_279': 'test_82',\n",
      " 'column_28': 'test_88',\n",
      " 'column_280': 'test_86',\n",
      " 'column_281': 'test_22',\n",
      " 'column_282': 'test_90',\n",
      " 'column_283': 'test_94',\n",
      " 'column_284': 'test_9',\n",
      " 'column_285': 'test_15',\n",
      " 'column_286': 'test_96',\n",
      " 'column_287': 'test_65',\n",
      " 'column_288': 'test_91',\n",
      " 'column_289': 'test_82',\n",
      " 'column_29': 'test_69',\n",
      " 'column_290': 'test_60',\n",
      " 'column_291': 'test_3',\n",
      " 'column_292': 'test_16',\n",
      " 'column_293': 'test_11',\n",
      " 'column_294': 'test_93',\n",
      " 'column_295': 'test_91',\n",
      " 'column_296': 'test_83',\n",
      " 'column_297': 'test_80',\n",
      " 'column_298': 'test_68',\n",
      " 'column_299': 'test_88',\n",
      " 'column_3': 'test_22',\n",
      " 'column_30': 'test_75',\n",
      " 'column_300': 'test_75',\n",
      " 'column_301': 'test_63',\n",
      " 'column_302': 'test_7',\n",
      " 'column_303': 'test_97',\n",
      " 'column_304': 'test_75',\n",
      " 'column_305': 'test_35',\n",
      " 'column_306': 'test_20',\n",
      " 'column_307': 'test_72',\n",
      " 'column_308': 'test_29',\n",
      " 'column_309': 'test_43',\n",
      " 'column_31': 'test_57',\n",
      " 'column_310': 'test_33',\n",
      " 'column_311': 'test_36',\n",
      " 'column_312': 'test_44',\n",
      " 'column_313': 'test_70',\n",
      " 'column_314': 'test_35',\n",
      " 'column_315': 'test_10',\n",
      " 'column_316': 'test_38',\n",
      " 'column_317': 'test_71',\n",
      " 'column_318': 'test_15',\n",
      " 'column_319': 'test_95',\n",
      " 'column_32': 'test_90',\n",
      " 'column_320': 'test_53',\n",
      " 'column_321': 'test_52',\n",
      " 'column_322': 'test_80',\n",
      " 'column_323': 'test_92',\n",
      " 'column_324': 'test_25',\n",
      " 'column_325': 'test_96',\n",
      " 'column_326': 'test_76',\n",
      " 'column_327': 'test_30',\n",
      " 'column_328': 'test_72',\n",
      " 'column_329': 'test_68',\n",
      " 'column_33': 'test_69',\n",
      " 'column_330': 'test_26',\n",
      " 'column_331': 'test_39',\n",
      " 'column_332': 'test_86',\n",
      " 'column_333': 'test_99',\n",
      " 'column_334': 'test_15',\n",
      " 'column_335': 'test_48',\n",
      " 'column_336': 'test_91',\n",
      " 'column_337': 'test_4',\n",
      " 'column_338': 'test_64',\n",
      " 'column_339': 'test_32',\n",
      " 'column_34': 'test_74',\n",
      " 'column_340': 'test_20',\n",
      " 'column_341': 'test_37',\n",
      " 'column_342': 'test_71',\n",
      " 'column_343': 'test_70',\n",
      " 'column_344': 'test_65',\n",
      " 'column_345': 'test_78',\n",
      " 'column_346': 'test_3',\n",
      " 'column_347': 'test_80',\n",
      " 'column_348': 'test_94',\n",
      " 'column_349': 'test_25',\n",
      " 'column_35': 'test_93',\n",
      " 'column_350': 'test_40',\n",
      " 'column_351': 'test_3',\n",
      " 'column_352': 'test_68',\n",
      " 'column_353': 'test_76',\n",
      " 'column_354': 'test_39',\n",
      " 'column_355': 'test_60',\n",
      " 'column_356': 'test_2',\n",
      " 'column_357': 'test_19',\n",
      " 'column_358': 'test_90',\n",
      " 'column_359': 'test_65',\n",
      " 'column_36': 'test_36',\n",
      " 'column_360': 'test_53',\n",
      " 'column_361': 'test_41',\n",
      " 'column_362': 'test_67',\n",
      " 'column_363': 'test_13',\n",
      " 'column_364': 'test_39',\n",
      " 'column_365': 'test_66',\n",
      " 'column_366': 'test_43',\n",
      " 'column_367': 'test_36',\n",
      " 'column_368': 'test_22',\n",
      " 'column_369': 'test_36',\n",
      " 'column_37': 'test_96',\n",
      " 'column_370': 'test_66',\n",
      " 'column_371': 'test_100',\n",
      " 'column_372': 'test_96',\n",
      " 'column_373': 'test_21',\n",
      " 'column_374': 'test_19',\n",
      " 'column_375': 'test_32',\n",
      " 'column_376': 'test_37',\n",
      " 'column_377': 'test_34',\n",
      " 'column_378': 'test_5',\n",
      " 'column_379': 'test_13',\n",
      " 'column_38': 'test_49',\n",
      " 'column_380': 'test_39',\n",
      " 'column_381': 'test_20',\n",
      " 'column_382': 'test_98',\n",
      " 'column_383': 'test_69',\n",
      " 'column_384': 'test_13',\n",
      " 'column_385': 'test_55',\n",
      " 'column_386': 'test_37',\n",
      " 'column_387': 'test_77',\n",
      " 'column_388': 'test_20',\n",
      " 'column_389': 'test_31',\n",
      " 'column_39': 'test_76',\n",
      " 'column_390': 'test_16',\n",
      " 'column_391': 'test_20',\n",
      " 'column_392': 'test_43',\n",
      " 'column_393': 'test_81',\n",
      " 'column_394': 'test_24',\n",
      " 'column_395': 'test_63',\n",
      " 'column_396': 'test_35',\n",
      " 'column_397': 'test_94',\n",
      " 'column_398': 'test_63',\n",
      " 'column_399': 'test_91',\n",
      " 'column_4': 'test_6',\n",
      " 'column_40': 'test_66',\n",
      " 'column_400': 'test_87',\n",
      " 'column_401': 'test_32',\n",
      " 'column_402': 'test_71',\n",
      " 'column_403': 'test_64',\n",
      " 'column_404': 'test_93',\n",
      " 'column_405': 'test_52',\n",
      " 'column_406': 'test_84',\n",
      " 'column_407': 'test_43',\n",
      " 'column_408': 'test_63',\n",
      " 'column_409': 'test_94',\n",
      " 'column_41': 'test_14',\n",
      " 'column_410': 'test_64',\n",
      " 'column_411': 'test_32',\n",
      " 'column_412': 'test_85',\n",
      " 'column_413': 'test_26',\n",
      " 'column_414': 'test_22',\n",
      " 'column_415': 'test_47',\n",
      " 'column_416': 'test_65',\n",
      " 'column_417': 'test_85',\n",
      " 'column_418': 'test_65',\n",
      " 'column_419': 'test_18',\n",
      " 'column_42': 'test_70',\n",
      " 'column_420': 'test_41',\n",
      " 'column_421': 'test_3',\n",
      " 'column_422': 'test_28',\n",
      " 'column_423': 'test_38',\n",
      " 'column_424': 'test_30',\n",
      " 'column_425': 'test_69',\n",
      " 'column_426': 'test_52',\n",
      " 'column_427': 'test_86',\n",
      " 'column_428': 'test_95',\n",
      " 'column_429': 'test_16',\n",
      " 'column_43': 'test_43',\n",
      " 'column_430': 'test_89',\n",
      " 'column_431': 'test_37',\n",
      " 'column_432': 'test_10',\n",
      " 'column_433': 'test_62',\n",
      " 'column_434': 'test_12',\n",
      " 'column_435': 'test_30',\n",
      " 'column_436': 'test_49',\n",
      " 'column_437': 'test_54',\n",
      " 'column_438': 'test_34',\n",
      " 'column_439': 'test_47',\n",
      " 'column_44': 'test_75',\n",
      " 'column_440': 'test_74',\n",
      " 'column_441': 'test_58',\n",
      " 'column_442': 'test_75',\n",
      " 'column_443': 'test_98',\n",
      " 'column_444': 'test_53',\n",
      " 'column_445': 'test_27',\n",
      " 'column_446': 'test_70',\n",
      " 'column_447': 'test_64',\n",
      " 'column_448': 'test_48',\n",
      " 'column_449': 'test_85',\n",
      " 'column_45': 'test_89',\n",
      " 'column_450': 'test_40',\n",
      " 'column_451': 'test_94',\n",
      " 'column_452': 'test_51',\n",
      " 'column_453': 'test_14',\n",
      " 'column_454': 'test_63',\n",
      " 'column_455': 'test_63',\n",
      " 'column_456': 'test_72',\n",
      " 'column_457': 'test_85',\n",
      " 'column_458': 'test_20',\n",
      " 'column_459': 'test_88',\n",
      " 'column_46': 'test_50',\n",
      " 'column_460': 'test_86',\n",
      " 'column_461': 'test_43',\n",
      " 'column_462': 'test_60',\n",
      " 'column_463': 'test_29',\n",
      " 'column_464': 'test_28',\n",
      " 'column_465': 'test_47',\n",
      " 'column_466': 'test_52',\n",
      " 'column_467': 'test_75',\n",
      " 'column_468': 'test_40',\n",
      " 'column_469': 'test_18',\n",
      " 'column_47': 'test_30',\n",
      " 'column_470': 'test_83',\n",
      " 'column_471': 'test_38',\n",
      " 'column_472': 'test_62',\n",
      " 'column_473': 'test_43',\n",
      " 'column_474': 'test_50',\n",
      " 'column_475': 'test_26',\n",
      " 'column_476': 'test_6',\n",
      " 'column_477': 'test_75',\n",
      " 'column_478': 'test_76',\n",
      " 'column_479': 'test_81',\n",
      " 'column_48': 'test_25',\n",
      " 'column_480': 'test_58',\n",
      " 'column_481': 'test_81',\n",
      " 'column_482': 'test_37',\n",
      " 'column_483': 'test_29',\n",
      " 'column_484': 'test_18',\n",
      " 'column_485': 'test_69',\n",
      " 'column_486': 'test_88',\n",
      " 'column_487': 'test_43',\n",
      " 'column_488': 'test_90',\n",
      " 'column_489': 'test_51',\n",
      " 'column_49': 'test_3',\n",
      " 'column_490': 'test_55',\n",
      " 'column_491': 'test_70',\n",
      " 'column_492': 'test_9',\n",
      " 'column_493': 'test_1',\n",
      " 'column_494': 'test_64',\n",
      " 'column_495': 'test_31',\n",
      " 'column_496': 'test_64',\n",
      " 'column_497': 'test_71',\n",
      " 'column_498': 'test_22',\n",
      " 'column_499': 'test_3',\n",
      " 'column_5': 'test_81',\n",
      " 'column_50': 'test_76',\n",
      " 'column_51': 'test_80',\n",
      " 'column_52': 'test_19',\n",
      " 'column_53': 'test_77',\n",
      " 'column_54': 'test_16',\n",
      " 'column_55': 'test_78',\n",
      " 'column_56': 'test_100',\n",
      " 'column_57': 'test_10',\n",
      " 'column_58': 'test_54',\n",
      " 'column_59': 'test_49',\n",
      " 'column_6': 'test_56',\n",
      " 'column_60': 'test_45',\n",
      " 'column_61': 'test_7',\n",
      " 'column_62': 'test_65',\n",
      " 'column_63': 'test_92',\n",
      " 'column_64': 'test_31',\n",
      " 'column_65': 'test_44',\n",
      " 'column_66': 'test_90',\n",
      " 'column_67': 'test_12',\n",
      " 'column_68': 'test_21',\n",
      " 'column_69': 'test_83',\n",
      " 'column_7': 'test_24',\n",
      " 'column_70': 'test_41',\n",
      " 'column_71': 'test_83',\n",
      " 'column_72': 'test_75',\n",
      " 'column_73': 'test_86',\n",
      " 'column_74': 'test_46',\n",
      " 'column_75': 'test_99',\n",
      " 'column_76': 'test_13',\n",
      " 'column_77': 'test_37',\n",
      " 'column_78': 'test_94',\n",
      " 'column_79': 'test_8',\n",
      " 'column_8': 'test_29',\n",
      " 'column_80': 'test_7',\n",
      " 'column_81': 'test_54',\n",
      " 'column_82': 'test_94',\n",
      " 'column_83': 'test_85',\n",
      " 'column_84': 'test_98',\n",
      " 'column_85': 'test_87',\n",
      " 'column_86': 'test_44',\n",
      " 'column_87': 'test_33',\n",
      " 'column_88': 'test_44',\n",
      " 'column_89': 'test_95',\n",
      " 'column_9': 'test_88',\n",
      " 'column_90': 'test_23',\n",
      " 'column_91': 'test_36',\n",
      " 'column_92': 'test_80',\n",
      " 'column_93': 'test_1',\n",
      " 'column_94': 'test_78',\n",
      " 'column_95': 'test_67',\n",
      " 'column_96': 'test_20',\n",
      " 'column_97': 'test_43',\n",
      " 'column_98': 'test_49',\n",
      " 'column_99': 'test_85',\n",
      " 'float_field': 11,\n",
      " 'int_field': 6,\n",
      " 'list_field': [1, 2, 3],\n",
      " 'name': 'item_22',\n",
      " 'nested_value': {'name': 'item_13', 'value': 16}}\n",
      "============================================================\n",
      "PARQUETDB SUMMARY\n",
      "============================================================\n",
      "Database path: ParquetDB\n",
      "\n",
      "• Number of columns: 507\n",
      "• Number of rows: 100000\n",
      "• Number of files: 1\n",
      "• Number of rows per file: [100000]\n",
      "• Number of row groups per file: [4]\n",
      "• Serialized metadata size per file: [225147] Bytes\n",
      "\n",
      "############################################################\n",
      "METADATA\n",
      "############################################################\n",
      "\n",
      "############################################################\n",
      "COLUMN DETAILS\n",
      "############################################################\n",
      "• Columns:\n",
      "    - column_8\n",
      "    - column_191\n",
      "    - column_195\n",
      "    - column_261\n",
      "    - column_118\n",
      "    - column_135\n",
      "    - column_50\n",
      "    - column_156\n",
      "    - column_69\n",
      "    - column_498\n",
      "    - column_196\n",
      "    - column_358\n",
      "    - column_451\n",
      "    - column_49\n",
      "    - column_70\n",
      "    - column_492\n",
      "    - column_493\n",
      "    - column_288\n",
      "    - column_340\n",
      "    - column_327\n",
      "    - column_468\n",
      "    - column_88\n",
      "    - column_193\n",
      "    - column_283\n",
      "    - column_121\n",
      "    - column_409\n",
      "    - column_97\n",
      "    - column_174\n",
      "    - column_160\n",
      "    - column_318\n",
      "    - column_482\n",
      "    - column_291\n",
      "    - column_0\n",
      "    - column_23\n",
      "    - column_270\n",
      "    - column_279\n",
      "    - column_382\n",
      "    - column_230\n",
      "    - column_456\n",
      "    - column_184\n",
      "    - column_259\n",
      "    - column_457\n",
      "    - column_273\n",
      "    - column_296\n",
      "    - column_90\n",
      "    - column_280\n",
      "    - column_323\n",
      "    - column_57\n",
      "    - column_94\n",
      "    - column_439\n",
      "    - column_205\n",
      "    - column_253\n",
      "    - nested_value.value\n",
      "    - column_432\n",
      "    - column_206\n",
      "    - column_368\n",
      "    - column_369\n",
      "    - column_167\n",
      "    - column_117\n",
      "    - column_152\n",
      "    - column_349\n",
      "    - column_39\n",
      "    - column_164\n",
      "    - column_307\n",
      "    - column_257\n",
      "    - column_82\n",
      "    - column_157\n",
      "    - column_194\n",
      "    - column_18\n",
      "    - column_467\n",
      "    - column_290\n",
      "    - column_297\n",
      "    - column_221\n",
      "    - column_294\n",
      "    - column_374\n",
      "    - column_331\n",
      "    - column_225\n",
      "    - column_443\n",
      "    - column_154\n",
      "    - column_59\n",
      "    - column_44\n",
      "    - column_308\n",
      "    - column_491\n",
      "    - column_357\n",
      "    - column_386\n",
      "    - column_423\n",
      "    - column_465\n",
      "    - column_110\n",
      "    - column_306\n",
      "    - column_263\n",
      "    - column_240\n",
      "    - column_401\n",
      "    - column_115\n",
      "    - column_92\n",
      "    - column_223\n",
      "    - column_21\n",
      "    - column_168\n",
      "    - column_412\n",
      "    - column_190\n",
      "    - column_487\n",
      "    - column_43\n",
      "    - column_34\n",
      "    - column_24\n",
      "    - column_298\n",
      "    - column_362\n",
      "    - column_281\n",
      "    - column_309\n",
      "    - column_381\n",
      "    - column_67\n",
      "    - column_17\n",
      "    - column_311\n",
      "    - column_245\n",
      "    - column_106\n",
      "    - column_472\n",
      "    - column_131\n",
      "    - column_391\n",
      "    - column_399\n",
      "    - column_445\n",
      "    - column_435\n",
      "    - column_376\n",
      "    - column_434\n",
      "    - column_344\n",
      "    - column_338\n",
      "    - column_202\n",
      "    - column_5\n",
      "    - column_350\n",
      "    - column_102\n",
      "    - column_217\n",
      "    - column_162\n",
      "    - column_212\n",
      "    - column_418\n",
      "    - column_178\n",
      "    - column_304\n",
      "    - column_353\n",
      "    - column_256\n",
      "    - column_53\n",
      "    - column_239\n",
      "    - column_469\n",
      "    - column_120\n",
      "    - column_12\n",
      "    - column_485\n",
      "    - column_124\n",
      "    - column_60\n",
      "    - column_11\n",
      "    - column_452\n",
      "    - column_9\n",
      "    - column_373\n",
      "    - column_100\n",
      "    - column_98\n",
      "    - column_20\n",
      "    - column_414\n",
      "    - column_47\n",
      "    - column_36\n",
      "    - column_268\n",
      "    - column_464\n",
      "    - column_198\n",
      "    - column_411\n",
      "    - column_436\n",
      "    - column_10\n",
      "    - column_228\n",
      "    - column_254\n",
      "    - column_158\n",
      "    - column_286\n",
      "    - column_182\n",
      "    - column_375\n",
      "    - column_475\n",
      "    - column_185\n",
      "    - column_244\n",
      "    - column_147\n",
      "    - column_260\n",
      "    - column_180\n",
      "    - column_315\n",
      "    - column_470\n",
      "    - id\n",
      "    - column_192\n",
      "    - column_285\n",
      "    - column_138\n",
      "    - column_163\n",
      "    - column_459\n",
      "    - column_287\n",
      "    - column_139\n",
      "    - column_243\n",
      "    - column_7\n",
      "    - column_3\n",
      "    - column_16\n",
      "    - column_177\n",
      "    - column_380\n",
      "    - column_219\n",
      "    - column_483\n",
      "    - column_295\n",
      "    - column_48\n",
      "    - column_32\n",
      "    - column_366\n",
      "    - column_430\n",
      "    - column_40\n",
      "    - column_54\n",
      "    - column_61\n",
      "    - column_330\n",
      "    - column_495\n",
      "    - column_275\n",
      "    - column_425\n",
      "    - column_132\n",
      "    - column_76\n",
      "    - column_200\n",
      "    - column_313\n",
      "    - column_301\n",
      "    - column_246\n",
      "    - column_326\n",
      "    - column_416\n",
      "    - column_19\n",
      "    - column_247\n",
      "    - column_406\n",
      "    - column_119\n",
      "    - column_277\n",
      "    - column_155\n",
      "    - column_30\n",
      "    - column_220\n",
      "    - column_354\n",
      "    - column_75\n",
      "    - column_499\n",
      "    - column_4\n",
      "    - column_89\n",
      "    - column_176\n",
      "    - column_255\n",
      "    - column_312\n",
      "    - column_413\n",
      "    - column_13\n",
      "    - column_395\n",
      "    - column_188\n",
      "    - column_91\n",
      "    - column_130\n",
      "    - float_field\n",
      "    - column_210\n",
      "    - column_461\n",
      "    - column_227\n",
      "    - column_148\n",
      "    - column_490\n",
      "    - column_146\n",
      "    - column_463\n",
      "    - nested_value.name\n",
      "    - column_37\n",
      "    - column_302\n",
      "    - column_410\n",
      "    - column_171\n",
      "    - column_222\n",
      "    - column_77\n",
      "    - column_232\n",
      "    - column_137\n",
      "    - column_359\n",
      "    - column_486\n",
      "    - column_27\n",
      "    - column_101\n",
      "    - column_474\n",
      "    - column_231\n",
      "    - column_62\n",
      "    - column_99\n",
      "    - column_363\n",
      "    - column_179\n",
      "    - column_404\n",
      "    - column_242\n",
      "    - column_329\n",
      "    - column_494\n",
      "    - column_497\n",
      "    - column_52\n",
      "    - column_63\n",
      "    - column_170\n",
      "    - column_25\n",
      "    - column_473\n",
      "    - column_6\n",
      "    - column_317\n",
      "    - column_265\n",
      "    - column_112\n",
      "    - column_437\n",
      "    - column_383\n",
      "    - column_144\n",
      "    - column_103\n",
      "    - column_142\n",
      "    - column_55\n",
      "    - column_258\n",
      "    - column_355\n",
      "    - column_428\n",
      "    - column_233\n",
      "    - column_333\n",
      "    - column_35\n",
      "    - column_324\n",
      "    - column_165\n",
      "    - column_56\n",
      "    - column_211\n",
      "    - column_45\n",
      "    - column_372\n",
      "    - column_397\n",
      "    - column_476\n",
      "    - column_134\n",
      "    - column_364\n",
      "    - column_111\n",
      "    - column_214\n",
      "    - column_293\n",
      "    - column_377\n",
      "    - column_81\n",
      "    - column_235\n",
      "    - column_419\n",
      "    - column_1\n",
      "    - column_421\n",
      "    - column_334\n",
      "    - column_28\n",
      "    - column_378\n",
      "    - column_332\n",
      "    - column_422\n",
      "    - column_189\n",
      "    - column_325\n",
      "    - column_447\n",
      "    - column_360\n",
      "    - column_116\n",
      "    - column_264\n",
      "    - column_237\n",
      "    - column_400\n",
      "    - column_319\n",
      "    - column_371\n",
      "    - column_159\n",
      "    - column_390\n",
      "    - column_384\n",
      "    - column_127\n",
      "    - column_31\n",
      "    - column_161\n",
      "    - column_133\n",
      "    - column_73\n",
      "    - column_209\n",
      "    - column_394\n",
      "    - column_454\n",
      "    - column_173\n",
      "    - column_83\n",
      "    - column_335\n",
      "    - column_278\n",
      "    - column_367\n",
      "    - column_450\n",
      "    - column_108\n",
      "    - column_93\n",
      "    - column_248\n",
      "    - column_208\n",
      "    - column_396\n",
      "    - column_453\n",
      "    - column_345\n",
      "    - column_388\n",
      "    - column_420\n",
      "    - column_356\n",
      "    - column_51\n",
      "    - column_109\n",
      "    - column_348\n",
      "    - column_303\n",
      "    - column_479\n",
      "    - column_274\n",
      "    - column_72\n",
      "    - column_387\n",
      "    - column_481\n",
      "    - column_370\n",
      "    - column_346\n",
      "    - column_305\n",
      "    - column_141\n",
      "    - column_417\n",
      "    - column_339\n",
      "    - column_444\n",
      "    - column_426\n",
      "    - column_462\n",
      "    - column_96\n",
      "    - column_86\n",
      "    - column_408\n",
      "    - column_114\n",
      "    - column_466\n",
      "    - column_65\n",
      "    - column_379\n",
      "    - column_398\n",
      "    - column_365\n",
      "    - column_129\n",
      "    - column_292\n",
      "    - column_402\n",
      "    - column_460\n",
      "    - column_2\n",
      "    - column_74\n",
      "    - column_440\n",
      "    - column_236\n",
      "    - column_80\n",
      "    - column_251\n",
      "    - column_392\n",
      "    - column_427\n",
      "    - column_336\n",
      "    - column_107\n",
      "    - column_15\n",
      "    - column_149\n",
      "    - column_238\n",
      "    - column_442\n",
      "    - column_128\n",
      "    - list_field\n",
      "    - column_458\n",
      "    - column_441\n",
      "    - column_289\n",
      "    - column_446\n",
      "    - column_218\n",
      "    - column_140\n",
      "    - column_215\n",
      "    - column_105\n",
      "    - column_26\n",
      "    - column_203\n",
      "    - column_351\n",
      "    - column_337\n",
      "    - column_424\n",
      "    - column_320\n",
      "    - column_29\n",
      "    - column_341\n",
      "    - column_385\n",
      "    - column_389\n",
      "    - column_85\n",
      "    - column_151\n",
      "    - column_299\n",
      "    - column_321\n",
      "    - column_438\n",
      "    - column_269\n",
      "    - column_14\n",
      "    - column_58\n",
      "    - column_407\n",
      "    - column_347\n",
      "    - column_224\n",
      "    - column_38\n",
      "    - column_496\n",
      "    - column_300\n",
      "    - column_271\n",
      "    - column_489\n",
      "    - column_122\n",
      "    - column_252\n",
      "    - column_207\n",
      "    - column_229\n",
      "    - column_187\n",
      "    - column_113\n",
      "    - column_204\n",
      "    - column_455\n",
      "    - column_316\n",
      "    - column_314\n",
      "    - column_199\n",
      "    - column_284\n",
      "    - column_415\n",
      "    - column_310\n",
      "    - column_478\n",
      "    - column_234\n",
      "    - column_84\n",
      "    - column_342\n",
      "    - column_169\n",
      "    - column_276\n",
      "    - column_183\n",
      "    - int_field\n",
      "    - column_143\n",
      "    - column_449\n",
      "    - column_431\n",
      "    - column_175\n",
      "    - column_181\n",
      "    - column_104\n",
      "    - column_123\n",
      "    - column_343\n",
      "    - column_136\n",
      "    - column_226\n",
      "    - column_471\n",
      "    - column_480\n",
      "    - column_322\n",
      "    - column_197\n",
      "    - column_145\n",
      "    - column_172\n",
      "    - column_125\n",
      "    - column_272\n",
      "    - column_328\n",
      "    - column_64\n",
      "    - column_429\n",
      "    - column_78\n",
      "    - column_126\n",
      "    - column_42\n",
      "    - column_262\n",
      "    - column_484\n",
      "    - column_46\n",
      "    - column_150\n",
      "    - column_79\n",
      "    - column_352\n",
      "    - column_241\n",
      "    - column_477\n",
      "    - column_33\n",
      "    - column_87\n",
      "    - column_201\n",
      "    - column_249\n",
      "    - column_216\n",
      "    - column_213\n",
      "    - column_71\n",
      "    - column_405\n",
      "    - column_448\n",
      "    - column_433\n",
      "    - column_361\n",
      "    - column_153\n",
      "    - column_267\n",
      "    - column_250\n",
      "    - column_166\n",
      "    - column_68\n",
      "    - column_66\n",
      "    - column_95\n",
      "    - column_488\n",
      "    - column_186\n",
      "    - column_393\n",
      "    - column_282\n",
      "    - name\n",
      "    - column_266\n",
      "    - column_22\n",
      "    - column_41\n",
      "    - column_403\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pprint\n",
    "import time\n",
    "import os\n",
    "import shutil\n",
    "import pyarrow as pa\n",
    "from parquetdb import ParquetDB, LoadConfig, NormalizeConfig\n",
    "from parquetdb.utils.general_utils import generate_similar_data\n",
    "\n",
    "# Define a simple template data entry\n",
    "template_dict = {\n",
    "    \"float_field\": 10,\n",
    "    \"int_field\": 10,\n",
    "    \"name\": \"item\",\n",
    "    \"nested_value\": {\"value\": 10, \"name\": \"item\"},\n",
    "    \"list_field\": [1, 2, 3],\n",
    "}\n",
    "for x in range(500):\n",
    "    template_dict[f\"column_{x}\"] = \"test\"\n",
    "\n",
    "template = [template_dict]\n",
    "\n",
    "# Generate multiple data entries\n",
    "num_entries = 100000  # Feel free to adjust this\n",
    "data = generate_similar_data(template, num_entries)\n",
    "\n",
    "print(\"Generated Data:\")\n",
    "pprint.pprint(data[0])\n",
    "\n",
    "db_path = \"ParquetDB\"\n",
    "if os.path.exists(db_path):\n",
    "    shutil.rmtree(db_path)\n",
    "db = ParquetDB(db_path=db_path)\n",
    "\n",
    "db.create(data)\n",
    "print(db)\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Basic Read\n",
    "\n",
    "A straightforward read operation without any parameters returns **all** data in a `pyarrow.Table` format (default).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 507)\n",
      "Total memory allocated: 572.2216796875 MB\n",
      "Time taken to read all data: 0.3734889030456543 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read()\n",
    "print(data.shape)\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Column Selection\n",
    "\n",
    "### Read Specific Columns\n",
    "\n",
    "To retrieve specific columns, you can pass a list of column names to the `columns` parameter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 1)\n",
      "Total memory allocated: 0.77490234375 MB\n",
      "Time taken to read all data: 0.05100083351135254 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(columns=[\"int_field\"])\n",
    "print(data.shape)\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read Nested Columns\n",
    "\n",
    "As you can see, using column selection it is much faster and less memory intensive as only the specified columns are loaded into memory.\n",
    "\n",
    "ParquetDB by defualts flattens nested structures. If you want the read fields from the nested structure you can specify it by using the syntax `nested_value.field_name`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 1)\n",
      "Total memory allocated: 1.12738037109375 MB\n",
      "Time taken to read all data: 0.046912431716918945 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(columns=[\"nested_value.name\"])\n",
    "print(data.shape)\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Do not read selected columns\n",
    "\n",
    "If you want to read all columns but not include the selected columns you can set the `include_cols` parameter to `False`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 506)\n",
      "Total memory allocated: 571.6526489257812 MB\n",
      "Time taken to read all data: 0.35604023933410645 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(columns=[\"int_field\"], include_cols=False)\n",
    "print(data.shape)\n",
    "end_time = time.time()\n",
    "column_names = data.column_names\n",
    "assert \"int_field\" not in column_names\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filtering Data\n",
    "\n",
    "You can filter data by using the `filters` parameter. This parameter accepts a list of pyarrow expressions. ParquetDB will combine the filters using the `and` operator.\n",
    "\n",
    "> For more information on how to create pyarrow expressions, please refer to the [pyarrow documentation](https://arrow.apache.org/docs/python/compute.html#filtering-by-expressions)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(47684, 507)\n",
      "Total memory allocated: 425.08599853515625 MB\n",
      "Time taken to read all data: 0.4452495574951172 seconds\n"
     ]
    }
   ],
   "source": [
    "import pyarrow.compute as pc\n",
    "\n",
    "start_time = time.time()\n",
    "data = db.read(filters=[pc.field(\"int_field\") < 10])\n",
    "print(data.shape)\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Again, since we do not read all the rows, the memory usage is much lower and the time taken is also faster.\n",
    "\n",
    "You can can still choose which columns to read while filtering."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(47684, 1)\n",
      "Total memory allocated: 1.09210205078125 MB\n",
      "Time taken to read all data: 0.03300309181213379 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(columns=[\"nested_value.name\"], filters=[pc.field(\"int_field\") < 10])\n",
    "print(data.shape)\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rebuilding Nested Structures\n",
    "\n",
    "ParquetDB by default flattens nested structures. If you want to read the nested structure to read the nested data in its original form, you can set the `rebuild_nested_struct` parameter to `True`. This will create a new files that will contain the nested data. \n",
    "\n",
    "> Note: direct updates, creates, or deletes to the nested data will not be reflected in the original files. You will have to rebuild the nested structure from scratch. by using the `rebuild_nested_from_scratch` parameter.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 506)\n",
      "Total memory allocated: 528.3989868164062 MB\n",
      "Time taken to read all data: 0.29286813735961914 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(rebuild_nested_struct=True)\n",
    "print(data.shape)\n",
    "column_names = data.column_names\n",
    "assert \"nested_value.name\" not in column_names\n",
    "assert \"nested_value.value\" not in column_names\n",
    "assert \"nested_value\" in column_names\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since the data is in nested format, to read specific nested fields slightly change. Now you can directly call the parent field and it will return all children fields. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 1)\n",
      "                       nested_value\n",
      "0  {'name': 'item_13', 'value': 16}\n",
      "1  {'name': 'item_83', 'value': 13}\n",
      "2  {'name': 'item_27', 'value': 14}\n",
      "3  {'name': 'item_20', 'value': 20}\n",
      "4  {'name': 'item_87', 'value': 12}\n",
      "Time taken to read all data: 0.06846165657043457 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(columns=[\"nested_value\"], rebuild_nested_struct=True).to_pandas()\n",
    "print(data.shape)\n",
    "column_names = data.columns\n",
    "print(data.head())\n",
    "assert \"nested_value\" in column_names\n",
    "end_time = time.time()\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can still specify the child field in the previous form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100000, 1)\n",
      "      name\n",
      "0  item_13\n",
      "1  item_83\n",
      "2  item_27\n",
      "3  item_20\n",
      "4  item_87\n",
      "Time taken to read all data: 0.03645682334899902 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(columns=[\"nested_value.name\"], rebuild_nested_struct=True).to_pandas()\n",
    "print(data.shape)\n",
    "column_names = data.columns\n",
    "print(data.head())\n",
    "end_time = time.time()\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When the data is in nested format, creating pyarrow expressions for filtering is a bit different\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1009, 1)\n",
      "      name\n",
      "0  item_20\n",
      "1  item_20\n",
      "2  item_20\n",
      "3  item_20\n",
      "4  item_20\n",
      "Time taken to read all data: 0.04416155815124512 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "data = db.read(\n",
    "    columns=[\"nested_value.name\"],\n",
    "    filters=[pc.field(\"nested_value\", \"name\") == \"item_20\"],\n",
    "    rebuild_nested_struct=True,\n",
    ").to_pandas()\n",
    "print(data.shape)\n",
    "column_names = data.columns\n",
    "print(data.head())\n",
    "end_time = time.time()\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")\n",
    "data = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Batch Processing\n",
    "\n",
    "ParquetDB also supports batch processing. This is useful when you want to process data in smaller chunks to avoid memory issues. This can be done by setting the `load_format=\"batches\"` and setting the `batch_size` parameter.\n",
    "\n",
    "When this is used, the `read` method will return a generator that yields `pyarrow.RecordBatch` objects, which are similar to `pyarrow.Table` objects.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_cython_3_0_10.generator object at 0x000002BC5A011CC0>\n",
      "Total memory allocated: 392.501708984375 MB\n",
      "Time taken to read all data: 0.019051790237426758 seconds\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "generator = db.read(load_format=\"batches\", batch_size=10000)\n",
    "print(generator)\n",
    "end_time = time.time()\n",
    "print(f\"Total memory allocated: {pa.total_allocated_bytes()/1024/1024} MB\")\n",
    "print(f\"Time taken to read all data: {end_time - start_time} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, you can iterate over the generator to get the data in batches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(2768, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(2768, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(2768, 507)\n",
      "(1696, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(2768, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(2768, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(10000, 507)\n",
      "(2768, 507)\n",
      "(1696, 507)\n"
     ]
    }
   ],
   "source": [
    "for record_batch in generator:\n",
    "    print(record_batch.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More configurations can be used to control the batching process by adding the `load_config` parameter which is an instance of `LoadConfig`.\n",
    "\n",
    "```python\n",
    "@dataclass\n",
    "class LoadConfig:\n",
    "    \"\"\"\n",
    "    Configuration for loading data, specifying columns, filters, batch size, and memory usage.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    batch_size : int, optional\n",
    "        The number of rows to process in each batch (default: 131_072).\n",
    "    batch_readahead : int, optional\n",
    "        The number of batches to read ahead in a file (default: 16).\n",
    "    fragment_readahead : int, optional\n",
    "        The number of files to read ahead, improving IO utilization at the cost of RAM usage (default: 4).\n",
    "    fragment_scan_options : Optional[pa.dataset.FragmentScanOptions], optional\n",
    "        Options specific to a particular scan and fragment type, potentially changing across scans.\n",
    "    use_threads : bool, optional\n",
    "        Whether to use maximum parallelism determined by available CPU cores (default: True).\n",
    "    memory_pool : Optional[pa.MemoryPool], optional\n",
    "        The memory pool for allocations. Defaults to the system's default memory pool.\n",
    "    \"\"\"\n",
    "\n",
    "    batch_size: int = 131_072\n",
    "    batch_readahead: int = 16\n",
    "    fragment_readahead: int = 4\n",
    "    fragment_scan_options: Optional[pa.dataset.FragmentScanOptions] = None\n",
    "    use_threads: bool = True\n",
    "    memory_pool: Optional[pa.MemoryPool] = None\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time taken to read all data when batch_readahead is 1: 0.6078670024871826 seconds\n",
      "Time taken to read all data when batch_readahead is 4: 0.5378696918487549 seconds\n"
     ]
    }
   ],
   "source": [
    "load_config = LoadConfig(batch_size=10000, batch_readahead=1)\n",
    "generator = db.read(load_format=\"batches\", load_config=load_config)\n",
    "start_time = time.time()\n",
    "for record_batch in generator:\n",
    "    batch = record_batch.shape\n",
    "end_time = time.time()\n",
    "print(\n",
    "    f\"Time taken to read all data when batch_readahead is 1: {end_time - start_time} seconds\"\n",
    ")\n",
    "\n",
    "load_config = LoadConfig(batch_size=10000, batch_readahead=16)\n",
    "generator = db.read(load_format=\"batches\", load_config=load_config)\n",
    "start_time = time.time()\n",
    "for record_batch in generator:\n",
    "    batch = record_batch.shape\n",
    "end_time = time.time()\n",
    "print(\n",
    "    f\"Time taken to read all data when batch_readahead is 4: {end_time - start_time} seconds\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This difference is small because this dataset is small, but it can be more noticeable for larger datasets."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "parquetdb_dev",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
